In [1]:
import pandas as pd
import itertools
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
folder_path = '../assets/Assignment_Data_2023-2024/Assignment_Data_2023-2024/'
club_games_df = pd.read_csv(folder_path+'club_games.csv')
clubs_df = pd.read_csv(folder_path+'clubs.csv')
competition_df = pd.read_csv(folder_path+'competitions.csv')
game_lineup_df = pd.read_csv(folder_path+'game_lineups.csv')
players_df = pd.read_csv(folder_path+'players.csv')
appearances_df = pd.read_csv(folder_path+'appearances.csv')
In [2]:
'''
in dataset to abbreviate millions there is m near the number, we have to clean that in order to make the conversion
to a number
also excluding negative values
'''
# Clean the 'net_transfer_record' column and convert to numeric
clubs_df['net_transfer_record'] = clubs_df['net_transfer_record'].replace(r'[\$,€m]', '', regex=True)
clubs_df['net_transfer_record'] = pd.to_numeric(clubs_df['net_transfer_record'], errors='coerce')
# Merge the dataframes on competition ID
merged_df = pd.merge(clubs_df, competition_df[['competition_id', 'competition_code']], left_on='domestic_competition_id', right_on='competition_id')
# Group by competition code, club name, and last season, and sum the net transfer records
club_transfer_analysis = merged_df.groupby(['competition_code', 'name', 'last_season'])['net_transfer_record'].sum().reset_index()
def process_and_plot(group):
pivot_table = group.pivot(index='last_season', columns='name', values='net_transfer_record')
# Select last 3 seasons, handle cases with fewer than 3 seasons
if len(pivot_table.index) > 2:
last_3_years = pivot_table.index[-3:]
else:
last_3_years = pivot_table.index
pivot_table = pivot_table.loc[last_3_years]
# Replace negative values with NaN
pivot_table = pivot_table.where(pivot_table >= 0)
# Prepare data for Plotly
data = []
for column in pivot_table.columns:
trace = go.Scatter(
x=pivot_table.index,
y=pivot_table[column],
mode='lines+markers',
name=column
)
data.append(trace)
# Layout settings for the plot
layout = go.Layout(
title=f'Net Transfer Spending Over Seasons - {group.name}',
xaxis=dict(title='Season'),
yaxis=dict(title='Net Transfer Spending (in millions)'),
legend=dict(title='Club', x=1.05, y=1, orientation='v'),
margin=dict(l=50, r=50, t=80, b=50),
hovermode='closest',
showlegend=True
)
# Create and display the plot using Plotly
fig = go.Figure(data=data, layout=layout)
fig.show()
# Apply the function to each group in the groupby object
club_transfer_analysis.groupby('competition_code').apply(process_and_plot)
Out[2]:
Top 30 stadiums with the most seats¶
In [3]:
max_seats_per_team = clubs_df.groupby('name')['stadium_seats'].max()
max_seats_per_team = max_seats_per_team.sort_values(ascending=False)
# Select top 30 teams
top_stadiums = max_seats_per_team.head(30)
# Create Plotly bar chart
data = [
go.Bar(
x=top_stadiums.index,
y=top_stadiums.values,
marker=dict(color='skyblue')
)
]
layout = go.Layout(
title='Top 30 Stadium Capacity',
xaxis=dict(title='Club'),
yaxis=dict(title='Number of Seats'),
margin=dict(b=150), # Adjust bottom margin to accommodate x-axis labels
hovermode='closest',
)
fig = go.Figure(data=data, layout=layout)
fig.update_xaxes(tickangle=45, tickmode='array', tickvals=top_stadiums.index, ticktext=top_stadiums.index)
fig.show()
In [4]:
min_seats_per_team = clubs_df.groupby('name')['stadium_seats'].min()
min_seats_per_team = min_seats_per_team.sort_values(ascending=True)
# Select top 30 teams
top_stadiums = min_seats_per_team.head(30)
# Create Plotly bar chart
data = [
go.Bar(
x=top_stadiums.index,
y=top_stadiums.values,
marker=dict(color='lightgreen')
)
]
layout = go.Layout(
title='Top 30 Stadium Capacity (Lowest Seats Capacity)',
xaxis=dict(title='Club'),
yaxis=dict(title='Number of Seats'),
margin=dict(b=150), # Adjust bottom margin to accommodate x-axis labels
hovermode='closest',
)
fig = go.Figure(data=data, layout=layout)
fig.update_xaxes(tickangle=45, tickmode='array', tickvals=top_stadiums.index, ticktext=top_stadiums.index)
fig.show()
In [5]:
# data analysis of the club with the highest national team players
In [6]:
# Sort by number of national team players and select top 10
top_clubs = clubs_df.sort_values(by='national_team_players', ascending=False).head(20)
# Create Plotly bar chart
data = [
go.Bar(
x=top_clubs['name'],
y=top_clubs['national_team_players'],
marker=dict(color='blue')
)
]
layout = go.Layout(
title='Top 20 Clubs Worldwide with Most National Team Players',
xaxis=dict(title='Club'),
yaxis=dict(title='Number of National Team Players'),
margin=dict(b=150), # Adjust bottom margin to accommodate x-axis labels
hovermode='closest',
)
fig = go.Figure(data=data, layout=layout)
# Update x-axis tick angle for better readability
fig.update_xaxes(tickangle=45)
# Show plot
fig.show()
Data analysis of the largest player club¶
In [7]:
largest_club = clubs_df.sort_values(by='squad_size', ascending=False).head(5)
data = [
go.Bar(
x=largest_club['name'],
y=largest_club['squad_size'],
marker=dict(color='green')
)
]
layout = go.Layout(
title='Top 5 Clubs with Largest Squad Size',
xaxis=dict(title='Club'),
yaxis=dict(title='Squad Size'),
margin=dict(b=150), # Adjust bottom margin to accommodate x-axis labels
hovermode='closest',
)
fig = go.Figure(data=data, layout=layout)
fig.update_xaxes(tickangle=45)
fig.show()
Analysis of the clubs with the highest win rate and goal rate¶
In [8]:
import pandas as pd
import plotly.graph_objs as go
# Supponiamo che club_games_df e clubs_df siano già stati definiti e contengano i dati necessari
# Calcola il numero di vittorie per ciascun club
club_games_df['win'] = club_games_df['is_win'] == 1
wins_per_club = club_games_df.groupby('club_id')['win'].sum().reset_index()
wins_per_club.columns = ['club_id', 'num_wins']
# Calcola la media dei goal segnati per ciascun club
average_goals_per_club = club_games_df.groupby('club_id')['own_goals'].mean().reset_index()
average_goals_per_club.columns = ['club_id', 'average_goals']
# Unisci i dati delle vittorie e della media dei goal
club_stats = pd.merge(wins_per_club, average_goals_per_club, on='club_id')
club_stats = pd.merge(club_stats, clubs_df[['club_id', 'name']], on='club_id')
# Trova i top 5 club con il maggior numero di vittorie
top_5_clubs = club_stats.sort_values(by='num_wins', ascending=False).head(5)
# Crea un grafico a barre interattivo con Plotly
data = [
go.Bar(
x=top_5_clubs['name'],
y=top_5_clubs['num_wins'],
name='Number of Wins',
marker=dict(color='blue')
),
go.Bar(
x=top_5_clubs['name'],
y=top_5_clubs['average_goals'],
name='Average Goals',
marker=dict(color='green')
)
]
layout = go.Layout(
title='Top 5 Clubs by Number of Wins and Average Goals',
xaxis=dict(title='Club'),
yaxis=dict(title='Value'),
barmode='group'
)
fig = go.Figure(data=data, layout=layout)
# Mostra il grafico
fig.show()
Analysis of the club with the prizes player and the club with the most wins¶
In [9]:
import pandas as pd
import plotly.graph_objs as go
import plotly.subplots as sp
# Assumiamo che tu abbia già i dataframe players_df, clubs_df, appearances_df e matches_df
# Trovare il giocatore più pagato
max_market_value_player = players_df.loc[players_df['market_value_in_eur'].idxmax()]
max_market_value_team_id = max_market_value_player['current_club_id']
# Calcolare il numero totale di goal per ciascun team
total_goals_df = appearances_df.groupby('player_club_id')['goals'].sum().reset_index()
max_goals_team_id = total_goals_df.loc[total_goals_df['goals'].idxmax()]['player_club_id']
# Estrazione dei dati per i team interessati
max_market_value_team = clubs_df[clubs_df['club_id'] == max_market_value_team_id]
max_goals_team = clubs_df[clubs_df['club_id'] == max_goals_team_id]
# Funzione per estrarre dati per il grafico
def extract_team_data(team_df, team_id):
# Calcolare le vittorie per il team
team_wins =club_games_df[(club_games_df['club_id'] == team_id) & (club_games_df['is_win'] == 1)].shape[0]
return {
'squad_size': team_df['squad_size'].values[0],
'average_age': team_df['average_age'].values[0],
'yellow_cards': appearances_df[appearances_df['player_club_id'] == team_id]['yellow_cards'].sum(),
'red_cards': appearances_df[appearances_df['player_club_id'] == team_id]['red_cards'].sum(),
'wins': team_wins,
'losses': 0 # Supponiamo che non abbiamo informazioni sulle sconfitte
}
# Dati per i team più pagato e con più goal
max_market_value_team_data = extract_team_data(max_market_value_team, max_market_value_team_id)
max_goals_team_data = extract_team_data(max_goals_team, max_goals_team_id)
# Nome dei club
max_market_value_team_name = max_market_value_team['name'].values[0]
max_goals_team_name = max_goals_team['name'].values[0]
# Creare il grafico interattivo
fig = sp.make_subplots(
rows=1,
cols=2,
subplot_titles=(f"Team del Giocatore più Pagato ({max_market_value_team_name})", f"Team con più Goal ({max_goals_team_name})")
)
# Aggiungere dati al grafico
fig.add_trace(go.Bar(x=list(max_market_value_team_data.keys()), y=list(max_market_value_team_data.values()), name=max_market_value_team_name), row=1, col=1)
fig.add_trace(go.Bar(x=list(max_goals_team_data.keys()), y=list(max_goals_team_data.values()), name=max_goals_team_name), row=1, col=2)
# Aggiornare il layout del grafico
fig.update_layout(title_text="Confronto tra Team", showlegend=False)
# Visualizzare il grafico
fig.show()
data analysis of the club with the highest net transfer record for each competition with the most wins club for each competition¶
In [10]:
# 1. Calcoliamo i cartellini rossi e gialli per ogni squadra
red_cards = appearances_df.groupby('player_current_club_id')['red_cards'].sum()
yellow_cards = appearances_df.groupby('player_current_club_id')['yellow_cards'].sum()
# 2. Calcoliamo il numero di vittorie per ogni squadra
wins = club_games_df.groupby('club_id')['is_win'].sum()
# 3. Otteniamo la dimensione della squadra (squad_size)
squad_size = clubs_df.set_index('club_id')['squad_size']
# Uniamo tutte le metriche in un unico DataFrame per semplificare il plotting
data = pd.DataFrame({
'Red Cards': red_cards,
'Yellow Cards': yellow_cards,
'Wins': wins,
'Squad Size': squad_size
}).fillna(0) # Gestione dei NaN nel caso ci siano dati mancanti
# Uniamo anche il nome delle squadre per renderlo più leggibile nei grafici
data = data.join(clubs_df.set_index('club_id')['name'])
# Troviamo la squadra con più trasferimenti
team_with_most_transfers = data['Squad Size'].idxmax()
# Troviamo la squadra più vincente
team_with_most_wins = data['Wins'].idxmax()
import plotly.graph_objs as go
from plotly.subplots import make_subplots
# Filtra i dati per le due squadre principali di ogni campionato
team1_data = data[data.index == team_with_most_transfers]
team2_data = data[data.index == team_with_most_wins]
# Crea i subplot
fig = make_subplots(rows=2, cols=2, subplot_titles=("Red Cards", "Yellow Cards", "Wins", "Squad Size"))
# Aggiungi i grafici per ciascuna metrica
fig.add_trace(go.Bar(x=team1_data['name'], y=team1_data['Red Cards'], name='Most Transfers'), row=1, col=1)
fig.add_trace(go.Bar(x=team2_data['name'], y=team2_data['Red Cards'], name='Most Wins'), row=1, col=1)
fig.add_trace(go.Bar(x=team1_data['name'], y=team1_data['Yellow Cards'], name='Most Transfers'), row=1, col=2)
fig.add_trace(go.Bar(x=team2_data['name'], y=team2_data['Yellow Cards'], name='Most Wins'), row=1, col=2)
fig.add_trace(go.Bar(x=team1_data['name'], y=team1_data['Wins'], name='Most Transfers'), row=2, col=1)
fig.add_trace(go.Bar(x=team2_data['name'], y=team2_data['Wins'], name='Most Wins'), row=2, col=1)
fig.add_trace(go.Bar(x=team1_data['name'], y=team1_data['Squad Size'], name='Most Transfers'), row=2, col=2)
fig.add_trace(go.Bar(x=team2_data['name'], y=team2_data['Squad Size'], name='Most Wins'), row=2, col=2)
# Aggiungi titoli e nomi degli assi
fig.update_layout(title='Confronto tra Squadra con più trasferimenti e più vincente',
xaxis_title='Squadra',
yaxis_title='Valore',
barmode='group')
# Mostra il grafico
fig.show()
In [11]:
# Fusione dei dati delle partite con i dati dei club per ottenere i nomi dei club
merge_club_games_df = pd.merge(club_games_df, clubs_df, on='club_id')
# Effettua il merge con il DataFrame delle apparizioni basato su game_id o altre chiavi di collegamento appropriate
merge_club_games_appearances_df = pd.merge(merge_club_games_df, appearances_df[['game_id', 'yellow_cards', 'red_cards']], on='game_id', how='left')
# Conteggio delle vittorie per ciascun club in ogni competizione
wins_count_per_club = merge_club_games_appearances_df[merge_club_games_appearances_df['is_win'] == 1].groupby(['domestic_competition_id', 'club_id']).size().reset_index(name='wins_count')
# Trova il club con il massimo numero di vittorie per ogni competizione
clubs_with_most_wins_per_competition = wins_count_per_club.loc[wins_count_per_club.groupby('domestic_competition_id')['wins_count'].idxmax()]
# Inizializza un elenco di figure per ogni competizione
figures = []
# Ciclo su ogni competizione e crea una figura separata per ciascuna
for comp_id in clubs_with_most_wins_per_competition['domestic_competition_id']:
# Trova il nome della squadra con più trasferimenti e più vittorie per questa competizione
team_most_transfers = clubs_df[clubs_df['domestic_competition_id'] == comp_id]['club_id'].iloc[0]
team_most_wins = clubs_with_most_wins_per_competition[clubs_with_most_wins_per_competition['domestic_competition_id'] == comp_id]['club_id'].iloc[0]
# Filtra i dati per le due squadre di interesse
team1_data = merge_club_games_appearances_df[(merge_club_games_appearances_df['club_id'] == team_most_transfers) & (merge_club_games_appearances_df['domestic_competition_id'] == comp_id)].iloc[0]
team2_data = merge_club_games_appearances_df[(merge_club_games_appearances_df['club_id'] == team_most_wins) & (merge_club_games_appearances_df['domestic_competition_id'] == comp_id)].iloc[0]
# Crea una nuova figura per la competizione corrente
fig = make_subplots(rows=1, cols=5, subplot_titles=[f"Competizione {comp_id} - {clubs_df.loc[clubs_df['club_id'] == team_most_transfers, 'name'].iloc[0]} vs {clubs_df.loc[clubs_df['club_id'] == team_most_wins, 'name'].iloc[0]}"])
# Aggiungi i grafici per ogni metrica sulla competizione corrente
fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
y=[team1_data['yellow_cards'], team2_data['yellow_cards']],
name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_transfers, "name"].iloc[0]} - Yellow Cards', marker_color='gold', legendgroup='team1'), row=1, col=1)
fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
y=[team1_data['red_cards'], team2_data['red_cards']],
name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_transfers, "name"].iloc[0]} - Red Cards', marker_color='indianred', legendgroup='team1'), row=1, col=2)
fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
y=[team1_data['is_win'], team2_data['is_win']],
name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_transfers, "name"].iloc[0]} - Wins', marker_color='mediumseagreen', legendgroup='team1'), row=1, col=3)
fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
y=[team1_data['total_market_value'], team2_data['total_market_value']],
name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_transfers, "name"].iloc[0]} - Total Market Value', marker_color='royalblue', legendgroup='team1'), row=1, col=4)
fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
y=[team1_data['squad_size'], team2_data['squad_size']],
name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_transfers, "name"].iloc[0]} - Squad Size', marker_color='orange', legendgroup='team1'), row=1, col=5)
# Aggiungi i grafici per il secondo team con la stessa metrica
fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
y=[team2_data['yellow_cards'], team1_data['yellow_cards']],
name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_wins, "name"].iloc[0]} - Yellow Cards', marker_color='gold', legendgroup='team2'), row=1, col=1)
fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
y=[team2_data['red_cards'], team1_data['red_cards']],
name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_wins, "name"].iloc[0]} - Red Cards', marker_color='indianred', legendgroup='team2'), row=1, col=2)
fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
y=[team2_data['is_win'], team1_data['is_win']],
name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_wins, "name"].iloc[0]} - Wins', marker_color='mediumseagreen', legendgroup='team2'), row=1, col=3)
fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
y=[team2_data['total_market_value'], team1_data['total_market_value']],
name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_wins, "name"].iloc[0]} - Total Market Value', marker_color='royalblue', legendgroup='team2'), row=1, col=4)
fig.add_trace(go.Bar(x=['Most Transfers', 'Most Wins'],
y=[team2_data['squad_size'], team1_data['squad_size']],
name=f'{clubs_df.loc[clubs_df["club_id"] == team_most_wins, "name"].iloc[0]} - Squad Size', marker_color='orange', legendgroup='team2'), row=1, col=5)
# Aggiungi la figura alla lista delle figure
figures.append(fig)
# Aggiungi i layout e mostra ogni figura separatamente
for fig in figures:
fig.update_layout(title='Confronto tra Squadra con più Trasferimenti e Squadra con più Vittorie',
showlegend=True, height=400, width=1000, barmode='group')
fig.update_xaxes(title_text="Metrica", row=1, col=3)
fig.update_yaxes(title_text="Valore", row=1, col=1)
fig.show()